{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Dueling DQN to Play MountainCar-v0\n",
    "\n",
    "PyTorch version"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import sys\n",
    "import logging\n",
    "import itertools\n",
    "import copy\n",
    "\n",
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "import pandas as pd\n",
    "import gym\n",
    "import matplotlib.pyplot as plt\n",
    "import torch\n",
    "torch.manual_seed(0)\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "\n",
    "logging.basicConfig(level=logging.DEBUG,\n",
    "        format='%(asctime)s [%(levelname)s] %(message)s',\n",
    "        stream=sys.stdout, datefmt='%H:%M:%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22:49:57 [INFO] env: <MountainCarEnv<MountainCar-v0>>\n",
      "22:49:57 [INFO] action_space: Discrete(3)\n",
      "22:49:57 [INFO] observation_space: Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)\n",
      "22:49:57 [INFO] reward_range: (-inf, inf)\n",
      "22:49:57 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}\n",
      "22:49:57 [INFO] _max_episode_steps: 200\n",
      "22:49:57 [INFO] _elapsed_steps: None\n",
      "22:49:57 [INFO] id: MountainCar-v0\n",
      "22:49:57 [INFO] entry_point: gym.envs.classic_control:MountainCarEnv\n",
      "22:49:57 [INFO] reward_threshold: -110.0\n",
      "22:49:57 [INFO] nondeterministic: False\n",
      "22:49:57 [INFO] max_episode_steps: 200\n",
      "22:49:57 [INFO] _kwargs: {}\n",
      "22:49:57 [INFO] _env_name: MountainCar\n"
     ]
    }
   ],
   "source": [
    "# create the environment and fix its random seed\n",
    "env = gym.make('MountainCar-v0')\n",
    "env.seed(0)\n",
    "\n",
    "# log every attribute of the environment first, then of its spec\n",
    "for obj in (env, env.spec):\n",
    "    for key, value in vars(obj).items():\n",
    "        logging.info('%s: %s', key, value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DQNReplayer:\n",
    "    def __init__(self, capacity):\n",
    "        self.memory = pd.DataFrame(index=range(capacity),\n",
    "                columns=['state', 'action', 'reward', 'next_state', 'done'])\n",
    "        self.i = 0\n",
    "        self.count = 0\n",
    "        self.capacity = capacity\n",
    "\n",
    "    def store(self, *args):\n",
    "        self.memory.loc[self.i] = args\n",
    "        self.i = (self.i + 1) % self.capacity\n",
    "        self.count = min(self.count + 1, self.capacity)\n",
    "\n",
    "    def sample(self, size):\n",
    "        indices = np.random.choice(self.count, size=size)\n",
    "        return (np.stack(self.memory.loc[indices, field]) for field in\n",
    "                self.memory.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DuelNet(nn.Module):\n",
    "    def __init__(self, input_size, output_size):\n",
    "        super().__init__()\n",
    "        self.common_net = nn.Sequential(nn.Linear(input_size, 64), nn.ReLU())\n",
    "        self.advantage_net = nn.Sequential(nn.Linear(64, 32), nn.ReLU(),\n",
    "            nn.Linear(32, output_size))\n",
    "        self.v_net = nn.Sequential(nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 1))\n",
    "\n",
    "    def forward(self, s):\n",
    "        h = self.common_net(s)\n",
    "        adv = self.advantage_net(h)\n",
    "        adv = adv - adv.mean(1).unsqueeze(1)\n",
    "        v = self.v_net(h)\n",
    "        q = v + adv\n",
    "        return q\n",
    "\n",
    "\n",
    "class DuelDQNAgent:\n",
    "    def __init__(self, env):\n",
    "        self.action_n = env.action_space.n\n",
    "        self.gamma = 0.99\n",
    "\n",
    "        self.replayer = DQNReplayer(10000)\n",
    "\n",
    "        self.evaluate_net = DuelNet(input_size=env.observation_space.shape[0],\n",
    "                output_size=self.action_n)\n",
    "        self.optimizer = optim.Adam(self.evaluate_net.parameters(), lr=0.001)\n",
    "        self.loss = nn.MSELoss()\n",
    "\n",
    "    def reset(self, mode=None):\n",
    "        self.mode = mode\n",
    "        if self.mode == 'train':\n",
    "            self.trajectory = []\n",
    "            self.target_net = copy.deepcopy(self.evaluate_net)\n",
    "\n",
    "    def step(self, observation, reward, done):\n",
    "        if self.mode == 'train' and np.random.rand() < 0.001:\n",
    "            # epsilon-greedy policy in train mode\n",
    "            action = np.random.randint(self.action_n)\n",
    "        else:\n",
    "            state_tensor = torch.as_tensor(observation,\n",
    "                    dtype=torch.float).reshape(1, -1)\n",
    "            q_tensor = self.evaluate_net(state_tensor)\n",
    "            action_tensor = torch.argmax(q_tensor)\n",
    "            action = action_tensor.item()\n",
    "        if self.mode == 'train':\n",
    "            self.trajectory += [observation, reward, done, action]\n",
    "            if len(self.trajectory) >= 8:\n",
    "                state, _, _, act, next_state, reward, done, _ = \\\n",
    "                        self.trajectory[-8:]\n",
    "                self.replayer.store(state, act, reward, next_state, done)\n",
    "            if self.replayer.count >= self.replayer.capacity * 0.95:\n",
    "                    # skip first few episodes for speed\n",
    "                self.learn()\n",
    "        return action\n",
    "\n",
    "    def close(self):\n",
    "        pass\n",
    "\n",
    "    def learn(self):\n",
    "        # replay\n",
    "        states, actions, rewards, next_states, dones = \\\n",
    "                self.replayer.sample(1024) # replay transitions\n",
    "        state_tensor = torch.as_tensor(states, dtype=torch.float)\n",
    "        action_tensor = torch.as_tensor(actions, dtype=torch.long)\n",
    "        reward_tensor = torch.as_tensor(rewards, dtype=torch.float)\n",
    "        next_state_tensor = torch.as_tensor(next_states, dtype=torch.float)\n",
    "        done_tensor = torch.as_tensor(dones, dtype=torch.float)\n",
    "\n",
    "        # train\n",
    "        next_eval_q_tensor = self.evaluate_net(next_state_tensor)\n",
    "        next_action_tensor = next_eval_q_tensor.argmax(axis=-1)\n",
    "        next_q_tensor = self.target_net(next_state_tensor)\n",
    "        next_max_q_tensor = torch.gather(next_q_tensor, 1,\n",
    "                next_action_tensor.unsqueeze(1)).squeeze(1)\n",
    "        target_tensor = reward_tensor + self.gamma * (1. - done_tensor) * \\\n",
    "                next_max_q_tensor\n",
    "        pred_tensor = self.evaluate_net(state_tensor)\n",
    "        unsqueeze_tensor = action_tensor.unsqueeze(1)\n",
    "        q_tensor = pred_tensor.gather(1, action_tensor.unsqueeze(1)).squeeze(1)\n",
    "        loss_tensor = self.loss(target_tensor, q_tensor)\n",
    "        self.optimizer.zero_grad()\n",
    "        loss_tensor.backward()\n",
    "        self.optimizer.step()\n",
    "\n",
    "\n",
    "# build the agent from the environment's action and observation spaces\n",
    "agent = DuelDQNAgent(env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22:49:57 [INFO] ==== train ====\n",
      "22:49:58 [DEBUG] train episode 0: reward = -200.00, steps = 200\n",
      "22:49:58 [DEBUG] train episode 1: reward = -200.00, steps = 200\n",
      "22:49:58 [DEBUG] train episode 2: reward = -200.00, steps = 200\n",
      "22:49:58 [DEBUG] train episode 3: reward = -200.00, steps = 200\n",
      "22:49:59 [DEBUG] train episode 4: reward = -200.00, steps = 200\n",
      "22:49:59 [DEBUG] train episode 5: reward = -200.00, steps = 200\n",
      "22:49:59 [DEBUG] train episode 6: reward = -200.00, steps = 200\n",
      "22:49:59 [DEBUG] train episode 7: reward = -200.00, steps = 200\n",
      "22:50:00 [DEBUG] train episode 8: reward = -200.00, steps = 200\n",
      "22:50:00 [DEBUG] train episode 9: reward = -200.00, steps = 200\n",
      "22:50:00 [DEBUG] train episode 10: reward = -200.00, steps = 200\n",
      "22:50:00 [DEBUG] train episode 11: reward = -200.00, steps = 200\n",
      "22:50:01 [DEBUG] train episode 12: reward = -200.00, steps = 200\n",
      "22:50:01 [DEBUG] train episode 13: reward = -200.00, steps = 200\n",
      "22:50:01 [DEBUG] train episode 14: reward = -200.00, steps = 200\n",
      "22:50:01 [DEBUG] train episode 15: reward = -200.00, steps = 200\n",
      "22:50:02 [DEBUG] train episode 16: reward = -200.00, steps = 200\n",
      "22:50:02 [DEBUG] train episode 17: reward = -200.00, steps = 200\n",
      "22:50:02 [DEBUG] train episode 18: reward = -200.00, steps = 200\n",
      "22:50:02 [DEBUG] train episode 19: reward = -200.00, steps = 200\n",
      "22:50:03 [DEBUG] train episode 20: reward = -200.00, steps = 200\n",
      "22:50:03 [DEBUG] train episode 21: reward = -200.00, steps = 200\n",
      "22:50:03 [DEBUG] train episode 22: reward = -200.00, steps = 200\n",
      "22:50:03 [DEBUG] train episode 23: reward = -200.00, steps = 200\n",
      "22:50:04 [DEBUG] train episode 24: reward = -200.00, steps = 200\n",
      "22:50:04 [DEBUG] train episode 25: reward = -200.00, steps = 200\n",
      "22:50:04 [DEBUG] train episode 26: reward = -200.00, steps = 200\n",
      "22:50:04 [DEBUG] train episode 27: reward = -200.00, steps = 200\n",
      "22:50:05 [DEBUG] train episode 28: reward = -200.00, steps = 200\n",
      "22:50:05 [DEBUG] train episode 29: reward = -200.00, steps = 200\n",
      "22:50:05 [DEBUG] train episode 30: reward = -200.00, steps = 200\n",
      "22:50:05 [DEBUG] train episode 31: reward = -200.00, steps = 200\n",
      "22:50:06 [DEBUG] train episode 32: reward = -200.00, steps = 200\n",
      "22:50:06 [DEBUG] train episode 33: reward = -200.00, steps = 200\n",
      "22:50:06 [DEBUG] train episode 34: reward = -200.00, steps = 200\n",
      "22:50:06 [DEBUG] train episode 35: reward = -200.00, steps = 200\n",
      "22:50:07 [DEBUG] train episode 36: reward = -200.00, steps = 200\n",
      "22:50:07 [DEBUG] train episode 37: reward = -200.00, steps = 200\n",
      "22:50:07 [DEBUG] train episode 38: reward = -200.00, steps = 200\n",
      "22:50:07 [DEBUG] train episode 39: reward = -200.00, steps = 200\n",
      "22:50:08 [DEBUG] train episode 40: reward = -200.00, steps = 200\n",
      "22:50:08 [DEBUG] train episode 41: reward = -200.00, steps = 200\n",
      "22:50:08 [DEBUG] train episode 42: reward = -200.00, steps = 200\n",
      "22:50:08 [DEBUG] train episode 43: reward = -200.00, steps = 200\n",
      "22:50:08 [DEBUG] train episode 44: reward = -200.00, steps = 200\n",
      "22:50:09 [DEBUG] train episode 45: reward = -200.00, steps = 200\n",
      "22:50:09 [DEBUG] train episode 46: reward = -200.00, steps = 200\n",
      "22:50:30 [DEBUG] train episode 47: reward = -200.00, steps = 200\n",
      "22:51:47 [DEBUG] train episode 48: reward = -200.00, steps = 200\n",
      "22:53:04 [DEBUG] train episode 49: reward = -200.00, steps = 200\n",
      "22:54:25 [DEBUG] train episode 50: reward = -200.00, steps = 200\n",
      "22:55:44 [DEBUG] train episode 51: reward = -200.00, steps = 200\n",
      "22:56:59 [DEBUG] train episode 52: reward = -200.00, steps = 200\n",
      "22:58:16 [DEBUG] train episode 53: reward = -200.00, steps = 200\n",
      "22:59:33 [DEBUG] train episode 54: reward = -200.00, steps = 200\n",
      "23:01:09 [DEBUG] train episode 55: reward = -200.00, steps = 200\n",
      "23:02:37 [DEBUG] train episode 56: reward = -200.00, steps = 200\n",
      "23:04:08 [DEBUG] train episode 57: reward = -200.00, steps = 200\n",
      "23:05:34 [DEBUG] train episode 58: reward = -200.00, steps = 200\n",
      "23:06:59 [DEBUG] train episode 59: reward = -200.00, steps = 200\n",
      "23:08:21 [DEBUG] train episode 60: reward = -200.00, steps = 200\n",
      "23:09:46 [DEBUG] train episode 61: reward = -200.00, steps = 200\n",
      "23:11:20 [DEBUG] train episode 62: reward = -200.00, steps = 200\n",
      "23:13:39 [DEBUG] train episode 63: reward = -200.00, steps = 200\n",
      "23:16:13 [DEBUG] train episode 64: reward = -200.00, steps = 200\n",
      "23:18:45 [DEBUG] train episode 65: reward = -200.00, steps = 200\n",
      "23:21:17 [DEBUG] train episode 66: reward = -200.00, steps = 200\n",
      "23:23:54 [DEBUG] train episode 67: reward = -200.00, steps = 200\n",
      "23:26:30 [DEBUG] train episode 68: reward = -200.00, steps = 200\n",
      "23:29:03 [DEBUG] train episode 69: reward = -200.00, steps = 200\n",
      "23:31:40 [DEBUG] train episode 70: reward = -200.00, steps = 200\n",
      "23:34:10 [DEBUG] train episode 71: reward = -200.00, steps = 200\n",
      "23:36:44 [DEBUG] train episode 72: reward = -200.00, steps = 200\n",
      "23:39:23 [DEBUG] train episode 73: reward = -200.00, steps = 200\n",
      "23:41:58 [DEBUG] train episode 74: reward = -200.00, steps = 200\n",
      "23:44:52 [DEBUG] train episode 75: reward = -200.00, steps = 200\n",
      "23:47:43 [DEBUG] train episode 76: reward = -200.00, steps = 200\n",
      "23:50:38 [DEBUG] train episode 77: reward = -200.00, steps = 200\n",
      "23:53:27 [DEBUG] train episode 78: reward = -200.00, steps = 200\n",
      "23:56:03 [DEBUG] train episode 79: reward = -200.00, steps = 200\n",
      "23:58:36 [DEBUG] train episode 80: reward = -200.00, steps = 200\n",
      "00:01:10 [DEBUG] train episode 81: reward = -200.00, steps = 200\n",
      "00:03:43 [DEBUG] train episode 82: reward = -200.00, steps = 200\n",
      "00:06:14 [DEBUG] train episode 83: reward = -200.00, steps = 200\n",
      "00:08:43 [DEBUG] train episode 84: reward = -200.00, steps = 200\n",
      "00:11:15 [DEBUG] train episode 85: reward = -200.00, steps = 200\n",
      "00:13:47 [DEBUG] train episode 86: reward = -200.00, steps = 200\n",
      "00:16:17 [DEBUG] train episode 87: reward = -200.00, steps = 200\n",
      "00:18:48 [DEBUG] train episode 88: reward = -200.00, steps = 200\n",
      "00:21:19 [DEBUG] train episode 89: reward = -200.00, steps = 200\n",
      "00:23:51 [DEBUG] train episode 90: reward = -200.00, steps = 200\n",
      "00:26:22 [DEBUG] train episode 91: reward = -200.00, steps = 200\n",
      "00:28:56 [DEBUG] train episode 92: reward = -200.00, steps = 200\n",
      "00:31:25 [DEBUG] train episode 93: reward = -200.00, steps = 200\n",
      "00:34:01 [DEBUG] train episode 94: reward = -200.00, steps = 200\n",
      "00:36:30 [DEBUG] train episode 95: reward = -200.00, steps = 200\n",
      "00:39:03 [DEBUG] train episode 96: reward = -200.00, steps = 200\n",
      "00:41:28 [DEBUG] train episode 97: reward = -200.00, steps = 200\n",
      "00:43:58 [DEBUG] train episode 98: reward = -200.00, steps = 200\n",
      "00:46:27 [DEBUG] train episode 99: reward = -200.00, steps = 200\n",
      "00:48:55 [DEBUG] train episode 100: reward = -200.00, steps = 200\n",
      "00:51:24 [DEBUG] train episode 101: reward = -200.00, steps = 200\n",
      "00:53:54 [DEBUG] train episode 102: reward = -200.00, steps = 200\n",
      "00:56:23 [DEBUG] train episode 103: reward = -200.00, steps = 200\n",
      "00:58:28 [DEBUG] train episode 104: reward = -200.00, steps = 200\n",
      "01:00:56 [DEBUG] train episode 105: reward = -200.00, steps = 200\n",
      "01:03:24 [DEBUG] train episode 106: reward = -200.00, steps = 200\n",
      "01:05:52 [DEBUG] train episode 107: reward = -200.00, steps = 200\n",
      "01:08:23 [DEBUG] train episode 108: reward = -200.00, steps = 200\n",
      "01:10:53 [DEBUG] train episode 109: reward = -200.00, steps = 200\n",
      "01:13:22 [DEBUG] train episode 110: reward = -200.00, steps = 200\n",
      "01:15:52 [DEBUG] train episode 111: reward = -200.00, steps = 200\n",
      "01:18:20 [DEBUG] train episode 112: reward = -200.00, steps = 200\n",
      "01:20:49 [DEBUG] train episode 113: reward = -200.00, steps = 200\n",
      "01:23:18 [DEBUG] train episode 114: reward = -200.00, steps = 200\n",
      "01:25:46 [DEBUG] train episode 115: reward = -200.00, steps = 200\n",
      "01:28:20 [DEBUG] train episode 116: reward = -200.00, steps = 200\n",
      "01:30:50 [DEBUG] train episode 117: reward = -200.00, steps = 200\n",
      "01:33:20 [DEBUG] train episode 118: reward = -200.00, steps = 200\n",
      "01:35:48 [DEBUG] train episode 119: reward = -200.00, steps = 200\n",
      "01:38:18 [DEBUG] train episode 120: reward = -200.00, steps = 200\n",
      "01:40:48 [DEBUG] train episode 121: reward = -200.00, steps = 200\n",
      "01:43:18 [DEBUG] train episode 122: reward = -200.00, steps = 200\n",
      "01:45:47 [DEBUG] train episode 123: reward = -200.00, steps = 200\n",
      "01:48:15 [DEBUG] train episode 124: reward = -200.00, steps = 200\n",
      "01:50:44 [DEBUG] train episode 125: reward = -200.00, steps = 200\n",
      "01:53:13 [DEBUG] train episode 126: reward = -200.00, steps = 200\n",
      "01:55:44 [DEBUG] train episode 127: reward = -200.00, steps = 200\n",
      "01:58:13 [DEBUG] train episode 128: reward = -200.00, steps = 200\n",
      "02:00:41 [DEBUG] train episode 129: reward = -200.00, steps = 200\n",
      "02:03:09 [DEBUG] train episode 130: reward = -200.00, steps = 200\n",
      "02:05:37 [DEBUG] train episode 131: reward = -200.00, steps = 200\n",
      "02:08:07 [DEBUG] train episode 132: reward = -200.00, steps = 200\n",
      "02:10:37 [DEBUG] train episode 133: reward = -200.00, steps = 200\n",
      "02:13:07 [DEBUG] train episode 134: reward = -200.00, steps = 200\n",
      "02:15:35 [DEBUG] train episode 135: reward = -200.00, steps = 200\n",
      "02:18:04 [DEBUG] train episode 136: reward = -200.00, steps = 200\n",
      "02:20:33 [DEBUG] train episode 137: reward = -200.00, steps = 200\n",
      "02:23:03 [DEBUG] train episode 138: reward = -200.00, steps = 200\n",
      "02:25:33 [DEBUG] train episode 139: reward = -200.00, steps = 200\n",
      "02:28:06 [DEBUG] train episode 140: reward = -200.00, steps = 200\n",
      "02:30:33 [DEBUG] train episode 141: reward = -200.00, steps = 200\n",
      "02:33:00 [DEBUG] train episode 142: reward = -200.00, steps = 200\n",
      "02:35:28 [DEBUG] train episode 143: reward = -200.00, steps = 200\n",
      "02:37:57 [DEBUG] train episode 144: reward = -200.00, steps = 200\n",
      "02:40:11 [DEBUG] train episode 145: reward = -200.00, steps = 200\n",
      "02:42:28 [DEBUG] train episode 146: reward = -200.00, steps = 200\n",
      "02:44:55 [DEBUG] train episode 147: reward = -200.00, steps = 200\n",
      "02:47:22 [DEBUG] train episode 148: reward = -200.00, steps = 200\n",
      "02:49:50 [DEBUG] train episode 149: reward = -200.00, steps = 200\n",
      "02:52:19 [DEBUG] train episode 150: reward = -200.00, steps = 200\n",
      "02:54:50 [DEBUG] train episode 151: reward = -200.00, steps = 200\n",
      "02:57:05 [DEBUG] train episode 152: reward = -200.00, steps = 200\n",
      "02:59:02 [DEBUG] train episode 153: reward = -200.00, steps = 200\n",
      "03:01:00 [DEBUG] train episode 154: reward = -200.00, steps = 200\n",
      "03:02:58 [DEBUG] train episode 155: reward = -200.00, steps = 200\n",
      "03:04:56 [DEBUG] train episode 156: reward = -200.00, steps = 200\n",
      "03:06:54 [DEBUG] train episode 157: reward = -200.00, steps = 200\n",
      "03:08:52 [DEBUG] train episode 158: reward = -200.00, steps = 200\n",
      "03:10:50 [DEBUG] train episode 159: reward = -200.00, steps = 200\n",
      "03:12:48 [DEBUG] train episode 160: reward = -200.00, steps = 200\n",
      "03:14:45 [DEBUG] train episode 161: reward = -200.00, steps = 200\n",
      "03:16:42 [DEBUG] train episode 162: reward = -200.00, steps = 200\n",
      "03:18:38 [DEBUG] train episode 163: reward = -200.00, steps = 200\n",
      "03:20:31 [DEBUG] train episode 164: reward = -200.00, steps = 200\n",
      "03:22:23 [DEBUG] train episode 165: reward = -200.00, steps = 200\n",
      "03:24:15 [DEBUG] train episode 166: reward = -200.00, steps = 200\n",
      "03:26:07 [DEBUG] train episode 167: reward = -200.00, steps = 200\n",
      "03:28:01 [DEBUG] train episode 168: reward = -200.00, steps = 200\n",
      "03:29:52 [DEBUG] train episode 169: reward = -200.00, steps = 200\n",
      "03:31:44 [DEBUG] train episode 170: reward = -200.00, steps = 200\n",
      "03:33:34 [DEBUG] train episode 171: reward = -200.00, steps = 200\n",
      "03:35:24 [DEBUG] train episode 172: reward = -200.00, steps = 200\n",
      "03:37:14 [DEBUG] train episode 173: reward = -200.00, steps = 200\n",
      "03:39:05 [DEBUG] train episode 174: reward = -200.00, steps = 200\n",
      "03:40:57 [DEBUG] train episode 175: reward = -200.00, steps = 200\n",
      "03:42:48 [DEBUG] train episode 176: reward = -200.00, steps = 200\n",
      "03:44:38 [DEBUG] train episode 177: reward = -200.00, steps = 200\n",
      "03:46:28 [DEBUG] train episode 178: reward = -200.00, steps = 200\n",
      "03:48:18 [DEBUG] train episode 179: reward = -200.00, steps = 200\n",
      "03:50:08 [DEBUG] train episode 180: reward = -200.00, steps = 200\n",
      "03:51:59 [DEBUG] train episode 181: reward = -200.00, steps = 200\n",
      "03:53:50 [DEBUG] train episode 182: reward = -200.00, steps = 200\n",
      "03:55:41 [DEBUG] train episode 183: reward = -200.00, steps = 200\n",
      "03:57:33 [DEBUG] train episode 184: reward = -200.00, steps = 200\n",
      "03:59:23 [DEBUG] train episode 185: reward = -200.00, steps = 200\n",
      "04:01:15 [DEBUG] train episode 186: reward = -200.00, steps = 200\n",
      "04:03:04 [DEBUG] train episode 187: reward = -200.00, steps = 200\n",
      "04:04:51 [DEBUG] train episode 188: reward = -200.00, steps = 200\n",
      "04:06:39 [DEBUG] train episode 189: reward = -200.00, steps = 200\n",
      "04:08:29 [DEBUG] train episode 190: reward = -200.00, steps = 200\n",
      "04:10:29 [DEBUG] train episode 191: reward = -200.00, steps = 200\n",
      "04:12:17 [DEBUG] train episode 192: reward = -200.00, steps = 200\n",
      "04:14:07 [DEBUG] train episode 193: reward = -200.00, steps = 200\n",
      "04:15:56 [DEBUG] train episode 194: reward = -200.00, steps = 200\n",
      "04:17:44 [DEBUG] train episode 195: reward = -200.00, steps = 200\n",
      "04:19:23 [DEBUG] train episode 196: reward = -200.00, steps = 200\n",
      "04:20:48 [DEBUG] train episode 197: reward = -200.00, steps = 200\n",
      "04:22:11 [DEBUG] train episode 198: reward = -200.00, steps = 200\n",
      "04:23:34 [DEBUG] train episode 199: reward = -200.00, steps = 200\n",
      "04:24:57 [DEBUG] train episode 200: reward = -200.00, steps = 200\n",
      "04:26:19 [DEBUG] train episode 201: reward = -200.00, steps = 200\n",
      "04:27:45 [DEBUG] train episode 202: reward = -200.00, steps = 200\n",
      "04:29:08 [DEBUG] train episode 203: reward = -200.00, steps = 200\n",
      "04:30:30 [DEBUG] train episode 204: reward = -200.00, steps = 200\n",
      "04:31:49 [DEBUG] train episode 205: reward = -200.00, steps = 200\n",
      "04:33:01 [DEBUG] train episode 206: reward = -200.00, steps = 200\n",
      "04:34:13 [DEBUG] train episode 207: reward = -200.00, steps = 200\n",
      "04:35:24 [DEBUG] train episode 208: reward = -200.00, steps = 200\n",
      "04:36:34 [DEBUG] train episode 209: reward = -200.00, steps = 200\n",
      "04:37:33 [DEBUG] train episode 210: reward = -200.00, steps = 200\n",
      "04:38:32 [DEBUG] train episode 211: reward = -200.00, steps = 200\n",
      "04:39:30 [DEBUG] train episode 212: reward = -200.00, steps = 200\n",
      "04:40:29 [DEBUG] train episode 213: reward = -200.00, steps = 200\n",
      "04:41:28 [DEBUG] train episode 214: reward = -200.00, steps = 200\n",
      "04:42:27 [DEBUG] train episode 215: reward = -200.00, steps = 200\n",
      "04:43:27 [DEBUG] train episode 216: reward = -200.00, steps = 200\n",
      "04:44:26 [DEBUG] train episode 217: reward = -200.00, steps = 200\n",
      "04:45:24 [DEBUG] train episode 218: reward = -200.00, steps = 200\n",
      "04:46:23 [DEBUG] train episode 219: reward = -200.00, steps = 200\n",
      "04:47:23 [DEBUG] train episode 220: reward = -200.00, steps = 200\n",
      "04:48:22 [DEBUG] train episode 221: reward = -200.00, steps = 200\n",
      "04:49:20 [DEBUG] train episode 222: reward = -200.00, steps = 200\n",
      "04:50:14 [DEBUG] train episode 223: reward = -200.00, steps = 200\n",
      "04:50:59 [DEBUG] train episode 224: reward = -200.00, steps = 200\n",
      "04:51:44 [DEBUG] train episode 225: reward = -200.00, steps = 200\n",
      "04:52:29 [DEBUG] train episode 226: reward = -200.00, steps = 200\n",
      "04:53:14 [DEBUG] train episode 227: reward = -200.00, steps = 200\n",
      "04:53:59 [DEBUG] train episode 228: reward = -200.00, steps = 200\n",
      "04:54:42 [DEBUG] train episode 229: reward = -200.00, steps = 200\n",
      "04:55:21 [DEBUG] train episode 230: reward = -200.00, steps = 200\n",
      "04:55:58 [DEBUG] train episode 231: reward = -200.00, steps = 200\n",
      "04:56:37 [DEBUG] train episode 232: reward = -200.00, steps = 200\n",
      "04:57:15 [DEBUG] train episode 233: reward = -200.00, steps = 200\n",
      "04:57:54 [DEBUG] train episode 234: reward = -200.00, steps = 200\n",
      "04:58:33 [DEBUG] train episode 235: reward = -200.00, steps = 200\n",
      "04:59:12 [DEBUG] train episode 236: reward = -200.00, steps = 200\n",
      "04:59:41 [DEBUG] train episode 237: reward = -200.00, steps = 200\n",
      "05:00:07 [DEBUG] train episode 238: reward = -200.00, steps = 200\n",
      "05:00:29 [DEBUG] train episode 239: reward = -200.00, steps = 200\n",
      "05:01:00 [DEBUG] train episode 240: reward = -200.00, steps = 200\n",
      "05:01:39 [DEBUG] train episode 241: reward = -200.00, steps = 200\n",
      "05:02:17 [DEBUG] train episode 242: reward = -200.00, steps = 200\n",
      "05:02:56 [DEBUG] train episode 243: reward = -200.00, steps = 200\n",
      "05:03:24 [DEBUG] train episode 244: reward = -200.00, steps = 200\n",
      "05:04:02 [DEBUG] train episode 245: reward = -200.00, steps = 200\n",
      "05:04:40 [DEBUG] train episode 246: reward = -200.00, steps = 200\n",
      "05:05:18 [DEBUG] train episode 247: reward = -200.00, steps = 200\n",
      "05:05:46 [DEBUG] train episode 248: reward = -200.00, steps = 200\n",
      "05:06:26 [DEBUG] train episode 249: reward = -200.00, steps = 200\n",
      "05:07:04 [DEBUG] train episode 250: reward = -200.00, steps = 200\n",
      "05:07:33 [DEBUG] train episode 251: reward = -200.00, steps = 200\n",
      "05:07:51 [DEBUG] train episode 252: reward = -200.00, steps = 200\n",
      "05:08:16 [DEBUG] train episode 253: reward = -200.00, steps = 200\n",
      "05:08:55 [DEBUG] train episode 254: reward = -200.00, steps = 200\n",
      "05:09:20 [DEBUG] train episode 255: reward = -200.00, steps = 200\n",
      "05:09:50 [DEBUG] train episode 256: reward = -200.00, steps = 200\n",
      "05:10:09 [DEBUG] train episode 257: reward = -200.00, steps = 200\n",
      "05:10:44 [DEBUG] train episode 258: reward = -200.00, steps = 200\n",
      "05:11:23 [DEBUG] train episode 259: reward = -200.00, steps = 200\n",
      "05:11:52 [DEBUG] train episode 260: reward = -200.00, steps = 200\n",
      "05:12:30 [DEBUG] train episode 261: reward = -200.00, steps = 200\n",
      "05:13:09 [DEBUG] train episode 262: reward = -200.00, steps = 200\n",
      "05:13:47 [DEBUG] train episode 263: reward = -200.00, steps = 200\n",
      "05:14:15 [DEBUG] train episode 264: reward = -200.00, steps = 200\n",
      "05:14:45 [DEBUG] train episode 265: reward = -200.00, steps = 200\n",
      "05:15:24 [DEBUG] train episode 266: reward = -200.00, steps = 200\n",
      "05:15:49 [DEBUG] train episode 267: reward = -200.00, steps = 200\n",
      "05:16:18 [DEBUG] train episode 268: reward = -200.00, steps = 200\n",
      "05:16:56 [DEBUG] train episode 269: reward = -200.00, steps = 200\n",
      "05:17:23 [DEBUG] train episode 270: reward = -200.00, steps = 200\n",
      "05:17:54 [DEBUG] train episode 271: reward = -200.00, steps = 200\n",
      "05:18:23 [DEBUG] train episode 272: reward = -200.00, steps = 200\n",
      "05:19:01 [DEBUG] train episode 273: reward = -200.00, steps = 200\n",
      "05:19:39 [DEBUG] train episode 274: reward = -200.00, steps = 200\n",
      "05:20:18 [DEBUG] train episode 275: reward = -200.00, steps = 200\n",
      "05:20:57 [DEBUG] train episode 276: reward = -200.00, steps = 200\n",
      "05:21:36 [DEBUG] train episode 277: reward = -200.00, steps = 200\n",
      "05:22:14 [DEBUG] train episode 278: reward = -200.00, steps = 200\n",
      "05:22:53 [DEBUG] train episode 279: reward = -200.00, steps = 200\n",
      "05:23:31 [DEBUG] train episode 280: reward = -200.00, steps = 200\n",
      "05:24:09 [DEBUG] train episode 281: reward = -200.00, steps = 200\n",
      "05:24:48 [DEBUG] train episode 282: reward = -200.00, steps = 200\n",
      "05:25:14 [DEBUG] train episode 283: reward = -200.00, steps = 200\n",
      "05:25:52 [DEBUG] train episode 284: reward = -200.00, steps = 200\n",
      "05:26:31 [DEBUG] train episode 285: reward = -200.00, steps = 200\n",
      "05:26:57 [DEBUG] train episode 286: reward = -200.00, steps = 200\n",
      "05:27:34 [DEBUG] train episode 287: reward = -200.00, steps = 200\n",
      "05:28:07 [DEBUG] train episode 288: reward = -200.00, steps = 200\n",
      "05:28:42 [DEBUG] train episode 289: reward = -200.00, steps = 200\n",
      "05:29:22 [DEBUG] train episode 290: reward = -200.00, steps = 200\n",
      "05:29:56 [DEBUG] train episode 291: reward = -200.00, steps = 200\n",
      "05:30:31 [DEBUG] train episode 292: reward = -200.00, steps = 200\n",
      "05:31:04 [DEBUG] train episode 293: reward = -200.00, steps = 200\n",
      "05:31:17 [DEBUG] train episode 294: reward = -200.00, steps = 200\n",
      "05:31:29 [DEBUG] train episode 295: reward = -200.00, steps = 200\n",
      "05:31:42 [DEBUG] train episode 296: reward = -200.00, steps = 200\n",
      "05:31:54 [DEBUG] train episode 297: reward = -200.00, steps = 200\n",
      "05:32:06 [DEBUG] train episode 298: reward = -200.00, steps = 200\n",
      "05:32:19 [DEBUG] train episode 299: reward = -200.00, steps = 200\n",
      "05:32:25 [DEBUG] train episode 300: reward = -92.00, steps = 92\n",
      "05:32:37 [DEBUG] train episode 301: reward = -200.00, steps = 200\n",
      "05:32:49 [DEBUG] train episode 302: reward = -200.00, steps = 200\n",
      "05:33:08 [DEBUG] train episode 303: reward = -200.00, steps = 200\n",
      "05:33:28 [DEBUG] train episode 304: reward = -200.00, steps = 200\n",
      "05:33:49 [DEBUG] train episode 305: reward = -200.00, steps = 200\n",
      "05:34:10 [DEBUG] train episode 306: reward = -200.00, steps = 200\n",
      "05:34:28 [DEBUG] train episode 307: reward = -200.00, steps = 200\n",
      "05:34:48 [DEBUG] train episode 308: reward = -200.00, steps = 200\n",
      "05:35:08 [DEBUG] train episode 309: reward = -200.00, steps = 200\n",
      "05:35:28 [DEBUG] train episode 310: reward = -200.00, steps = 200\n",
      "05:35:46 [DEBUG] train episode 311: reward = -200.00, steps = 200\n",
      "05:36:07 [DEBUG] train episode 312: reward = -200.00, steps = 200\n",
      "05:36:27 [DEBUG] train episode 313: reward = -200.00, steps = 200\n",
      "05:36:48 [DEBUG] train episode 314: reward = -200.00, steps = 200\n",
      "05:37:09 [DEBUG] train episode 315: reward = -200.00, steps = 200\n",
      "05:37:30 [DEBUG] train episode 316: reward = -200.00, steps = 200\n",
      "05:37:50 [DEBUG] train episode 317: reward = -200.00, steps = 200\n",
      "05:38:11 [DEBUG] train episode 318: reward = -200.00, steps = 200\n",
      "05:38:31 [DEBUG] train episode 319: reward = -200.00, steps = 200\n",
      "05:38:52 [DEBUG] train episode 320: reward = -200.00, steps = 200\n",
      "05:39:14 [DEBUG] train episode 321: reward = -200.00, steps = 200\n",
      "05:39:33 [DEBUG] train episode 322: reward = -200.00, steps = 200\n",
      "05:39:54 [DEBUG] train episode 323: reward = -200.00, steps = 200\n",
      "05:40:15 [DEBUG] train episode 324: reward = -200.00, steps = 200\n",
      "05:40:35 [DEBUG] train episode 325: reward = -200.00, steps = 200\n",
      "05:40:54 [DEBUG] train episode 326: reward = -200.00, steps = 200\n",
      "05:41:14 [DEBUG] train episode 327: reward = -200.00, steps = 200\n",
      "05:41:35 [DEBUG] train episode 328: reward = -200.00, steps = 200\n",
      "05:41:55 [DEBUG] train episode 329: reward = -200.00, steps = 200\n",
      "05:42:16 [DEBUG] train episode 330: reward = -200.00, steps = 200\n",
      "05:42:36 [DEBUG] train episode 331: reward = -200.00, steps = 200\n",
      "05:42:56 [DEBUG] train episode 332: reward = -200.00, steps = 200\n",
      "05:43:16 [DEBUG] train episode 333: reward = -200.00, steps = 200\n",
      "05:43:36 [DEBUG] train episode 334: reward = -200.00, steps = 200\n",
      "05:43:56 [DEBUG] train episode 335: reward = -200.00, steps = 200\n",
      "05:44:17 [DEBUG] train episode 336: reward = -200.00, steps = 200\n",
      "05:44:38 [DEBUG] train episode 337: reward = -200.00, steps = 200\n",
      "05:44:58 [DEBUG] train episode 338: reward = -200.00, steps = 200\n",
      "05:45:18 [DEBUG] train episode 339: reward = -200.00, steps = 200\n",
      "05:45:38 [DEBUG] train episode 340: reward = -200.00, steps = 200\n",
      "05:45:59 [DEBUG] train episode 341: reward = -200.00, steps = 200\n",
      "05:46:19 [DEBUG] train episode 342: reward = -200.00, steps = 200\n",
      "05:46:40 [DEBUG] train episode 343: reward = -200.00, steps = 200\n",
      "05:47:00 [DEBUG] train episode 344: reward = -200.00, steps = 200\n",
      "05:47:20 [DEBUG] train episode 345: reward = -200.00, steps = 200\n",
      "05:47:37 [DEBUG] train episode 346: reward = -200.00, steps = 200\n",
      "05:47:47 [DEBUG] train episode 347: reward = -200.00, steps = 200\n",
      "05:47:57 [DEBUG] train episode 348: reward = -200.00, steps = 200\n",
      "05:48:07 [DEBUG] train episode 349: reward = -200.00, steps = 200\n",
      "05:48:17 [DEBUG] train episode 350: reward = -200.00, steps = 200\n",
      "05:48:26 [DEBUG] train episode 351: reward = -200.00, steps = 200\n",
      "05:48:37 [DEBUG] train episode 352: reward = -200.00, steps = 200\n",
      "05:48:46 [DEBUG] train episode 353: reward = -200.00, steps = 200\n",
      "05:48:54 [DEBUG] train episode 354: reward = -200.00, steps = 200\n",
      "05:49:02 [DEBUG] train episode 355: reward = -200.00, steps = 200\n",
      "05:49:10 [DEBUG] train episode 356: reward = -200.00, steps = 200\n",
      "05:49:17 [DEBUG] train episode 357: reward = -200.00, steps = 200\n",
      "05:49:25 [DEBUG] train episode 358: reward = -200.00, steps = 200\n",
      "05:49:30 [DEBUG] train episode 359: reward = -114.00, steps = 114\n",
      "05:49:34 [DEBUG] train episode 360: reward = -113.00, steps = 113\n",
      "05:49:38 [DEBUG] train episode 361: reward = -102.00, steps = 102\n",
      "05:49:45 [DEBUG] train episode 362: reward = -178.00, steps = 178\n",
      "05:49:49 [DEBUG] train episode 363: reward = -98.00, steps = 98\n",
      "05:49:53 [DEBUG] train episode 364: reward = -94.00, steps = 94\n",
      "05:50:01 [DEBUG] train episode 365: reward = -191.00, steps = 191\n",
      "05:50:08 [DEBUG] train episode 366: reward = -174.00, steps = 174\n",
      "05:50:16 [DEBUG] train episode 367: reward = -200.00, steps = 200\n",
      "05:50:26 [DEBUG] train episode 368: reward = -200.00, steps = 200\n",
      "05:50:34 [DEBUG] train episode 369: reward = -200.00, steps = 200\n",
      "05:50:42 [DEBUG] train episode 370: reward = -200.00, steps = 200\n",
      "05:50:50 [DEBUG] train episode 371: reward = -200.00, steps = 200\n",
      "05:50:57 [DEBUG] train episode 372: reward = -165.00, steps = 165\n",
      "05:51:04 [DEBUG] train episode 373: reward = -172.00, steps = 172\n",
      "05:51:12 [DEBUG] train episode 374: reward = -181.00, steps = 181\n",
      "05:51:20 [DEBUG] train episode 375: reward = -200.00, steps = 200\n",
      "05:51:28 [DEBUG] train episode 376: reward = -200.00, steps = 200\n",
      "05:51:37 [DEBUG] train episode 377: reward = -200.00, steps = 200\n",
      "05:51:45 [DEBUG] train episode 378: reward = -200.00, steps = 200\n",
      "05:51:53 [DEBUG] train episode 379: reward = -200.00, steps = 200\n",
      "05:52:01 [DEBUG] train episode 380: reward = -183.00, steps = 183\n",
      "05:52:09 [DEBUG] train episode 381: reward = -200.00, steps = 200\n",
      "05:52:13 [DEBUG] train episode 382: reward = -93.00, steps = 93\n",
      "05:52:17 [DEBUG] train episode 383: reward = -92.00, steps = 92\n",
      "05:52:22 [DEBUG] train episode 384: reward = -108.00, steps = 108\n",
      "05:52:25 [DEBUG] train episode 385: reward = -90.00, steps = 90\n",
      "05:52:34 [DEBUG] train episode 386: reward = -200.00, steps = 200\n",
      "05:52:38 [DEBUG] train episode 387: reward = -90.00, steps = 90\n",
      "05:52:44 [DEBUG] train episode 388: reward = -156.00, steps = 156\n",
      "05:52:52 [DEBUG] train episode 389: reward = -170.00, steps = 170\n",
      "05:52:55 [DEBUG] train episode 390: reward = -91.00, steps = 91\n",
      "05:53:02 [DEBUG] train episode 391: reward = -153.00, steps = 153\n",
      "05:53:05 [DEBUG] train episode 392: reward = -88.00, steps = 88\n",
      "05:53:11 [DEBUG] train episode 393: reward = -146.00, steps = 146\n",
      "05:53:15 [DEBUG] train episode 394: reward = -85.00, steps = 85\n",
      "05:53:19 [DEBUG] train episode 395: reward = -96.00, steps = 96\n",
      "05:53:22 [DEBUG] train episode 396: reward = -86.00, steps = 86\n",
      "05:53:28 [DEBUG] train episode 397: reward = -140.00, steps = 140\n",
      "05:53:34 [DEBUG] train episode 398: reward = -146.00, steps = 146\n",
      "05:53:41 [DEBUG] train episode 399: reward = -137.00, steps = 137\n",
      "05:53:44 [DEBUG] train episode 400: reward = -85.00, steps = 85\n",
      "05:53:50 [DEBUG] train episode 401: reward = -137.00, steps = 137\n",
      "05:53:56 [DEBUG] train episode 402: reward = -145.00, steps = 145\n",
      "05:53:59 [DEBUG] train episode 403: reward = -85.00, steps = 85\n",
      "05:54:05 [DEBUG] train episode 404: reward = -142.00, steps = 142\n",
      "05:54:11 [DEBUG] train episode 405: reward = -140.00, steps = 140\n",
      "05:54:17 [DEBUG] train episode 406: reward = -137.00, steps = 137\n",
      "05:54:22 [DEBUG] train episode 407: reward = -139.00, steps = 139\n",
      "05:54:28 [DEBUG] train episode 408: reward = -140.00, steps = 140\n",
      "05:54:32 [DEBUG] train episode 409: reward = -87.00, steps = 87\n",
      "05:54:38 [DEBUG] train episode 410: reward = -149.00, steps = 149\n",
      "05:54:44 [DEBUG] train episode 411: reward = -142.00, steps = 142\n",
      "05:54:48 [DEBUG] train episode 412: reward = -103.00, steps = 103\n",
      "05:54:55 [DEBUG] train episode 413: reward = -144.00, steps = 144\n",
      "05:54:58 [DEBUG] train episode 414: reward = -92.00, steps = 92\n",
      "05:55:03 [DEBUG] train episode 415: reward = -106.00, steps = 106\n",
      "05:55:07 [DEBUG] train episode 416: reward = -92.00, steps = 92\n",
      "05:55:11 [DEBUG] train episode 417: reward = -88.00, steps = 88\n",
      "05:55:18 [DEBUG] train episode 418: reward = -149.00, steps = 149\n",
      "05:55:22 [DEBUG] train episode 419: reward = -84.00, steps = 84\n",
      "05:55:25 [DEBUG] train episode 420: reward = -95.00, steps = 95\n",
      "05:55:25 [INFO] ==== test ====\n",
      "05:55:26 [DEBUG] test episode 0: reward = -156.00, steps = 156\n",
      "05:55:26 [DEBUG] test episode 1: reward = -94.00, steps = 94\n",
      "05:55:26 [DEBUG] test episode 2: reward = -87.00, steps = 87\n",
      "05:55:26 [DEBUG] test episode 3: reward = -147.00, steps = 147\n",
      "05:55:26 [DEBUG] test episode 4: reward = -157.00, steps = 157\n",
      "05:55:26 [DEBUG] test episode 5: reward = -146.00, steps = 146\n",
      "05:55:26 [DEBUG] test episode 6: reward = -89.00, steps = 89\n",
      "05:55:26 [DEBUG] test episode 7: reward = -85.00, steps = 85\n",
      "05:55:26 [DEBUG] test episode 8: reward = -158.00, steps = 158\n",
      "05:55:27 [DEBUG] test episode 9: reward = -146.00, steps = 146\n",
      "05:55:27 [DEBUG] test episode 10: reward = -157.00, steps = 157\n",
      "05:55:27 [DEBUG] test episode 11: reward = -146.00, steps = 146\n",
      "05:55:27 [DEBUG] test episode 12: reward = -86.00, steps = 86\n",
      "05:55:27 [DEBUG] test episode 13: reward = -149.00, steps = 149\n",
      "05:55:27 [DEBUG] test episode 14: reward = -148.00, steps = 148\n",
      "05:55:27 [DEBUG] test episode 15: reward = -86.00, steps = 86\n",
      "05:55:27 [DEBUG] test episode 16: reward = -146.00, steps = 146\n",
      "05:55:27 [DEBUG] test episode 17: reward = -87.00, steps = 87\n",
      "05:55:27 [DEBUG] test episode 18: reward = -157.00, steps = 157\n",
      "05:55:28 [DEBUG] test episode 19: reward = -148.00, steps = 148\n",
      "05:55:28 [DEBUG] test episode 20: reward = -147.00, steps = 147\n",
      "05:55:28 [DEBUG] test episode 21: reward = -98.00, steps = 98\n",
      "05:55:28 [DEBUG] test episode 22: reward = -155.00, steps = 155\n",
      "05:55:28 [DEBUG] test episode 23: reward = -146.00, steps = 146\n",
      "05:55:28 [DEBUG] test episode 24: reward = -89.00, steps = 89\n",
      "05:55:28 [DEBUG] test episode 25: reward = -157.00, steps = 157\n",
      "05:55:28 [DEBUG] test episode 26: reward = -146.00, steps = 146\n",
      "05:55:28 [DEBUG] test episode 27: reward = -88.00, steps = 88\n",
      "05:55:29 [DEBUG] test episode 28: reward = -146.00, steps = 146\n",
      "05:55:29 [DEBUG] test episode 29: reward = -146.00, steps = 146\n",
      "05:55:29 [DEBUG] test episode 30: reward = -95.00, steps = 95\n",
      "05:55:29 [DEBUG] test episode 31: reward = -86.00, steps = 86\n",
      "05:55:29 [DEBUG] test episode 32: reward = -147.00, steps = 147\n",
      "05:55:29 [DEBUG] test episode 33: reward = -91.00, steps = 91\n",
      "05:55:29 [DEBUG] test episode 34: reward = -155.00, steps = 155\n",
      "05:55:29 [DEBUG] test episode 35: reward = -86.00, steps = 86\n",
      "05:55:29 [DEBUG] test episode 36: reward = -146.00, steps = 146\n",
      "05:55:29 [DEBUG] test episode 37: reward = -146.00, steps = 146\n",
      "05:55:30 [DEBUG] test episode 38: reward = -157.00, steps = 157\n",
      "05:55:30 [DEBUG] test episode 39: reward = -90.00, steps = 90\n",
      "05:55:30 [DEBUG] test episode 40: reward = -87.00, steps = 87\n",
      "05:55:30 [DEBUG] test episode 41: reward = -89.00, steps = 89\n",
      "05:55:30 [DEBUG] test episode 42: reward = -147.00, steps = 147\n",
      "05:55:30 [DEBUG] test episode 43: reward = -147.00, steps = 147\n",
      "05:55:30 [DEBUG] test episode 44: reward = -103.00, steps = 103\n",
      "05:55:30 [DEBUG] test episode 45: reward = -146.00, steps = 146\n",
      "05:55:30 [DEBUG] test episode 46: reward = -146.00, steps = 146\n",
      "05:55:30 [DEBUG] test episode 47: reward = -155.00, steps = 155\n",
      "05:55:31 [DEBUG] test episode 48: reward = -89.00, steps = 89\n",
      "05:55:31 [DEBUG] test episode 49: reward = -159.00, steps = 159\n",
      "05:55:31 [DEBUG] test episode 50: reward = -88.00, steps = 88\n",
      "05:55:31 [DEBUG] test episode 51: reward = -98.00, steps = 98\n",
      "05:55:31 [DEBUG] test episode 52: reward = -92.00, steps = 92\n",
      "05:55:31 [DEBUG] test episode 53: reward = -145.00, steps = 145\n",
      "05:55:31 [DEBUG] test episode 54: reward = -94.00, steps = 94\n",
      "05:55:31 [DEBUG] test episode 55: reward = -156.00, steps = 156\n",
      "05:55:31 [DEBUG] test episode 56: reward = -145.00, steps = 145\n",
      "05:55:31 [DEBUG] test episode 57: reward = -90.00, steps = 90\n",
      "05:55:32 [DEBUG] test episode 58: reward = -86.00, steps = 86\n",
      "05:55:32 [DEBUG] test episode 59: reward = -98.00, steps = 98\n",
      "05:55:32 [DEBUG] test episode 60: reward = -155.00, steps = 155\n",
      "05:55:32 [DEBUG] test episode 61: reward = -158.00, steps = 158\n",
      "05:55:32 [DEBUG] test episode 62: reward = -88.00, steps = 88\n",
      "05:55:32 [DEBUG] test episode 63: reward = -146.00, steps = 146\n",
      "05:55:32 [DEBUG] test episode 64: reward = -147.00, steps = 147\n",
      "05:55:32 [DEBUG] test episode 65: reward = -156.00, steps = 156\n",
      "05:55:32 [DEBUG] test episode 66: reward = -91.00, steps = 91\n",
      "05:55:32 [DEBUG] test episode 67: reward = -85.00, steps = 85\n",
      "05:55:33 [DEBUG] test episode 68: reward = -146.00, steps = 146\n",
      "05:55:33 [DEBUG] test episode 69: reward = -146.00, steps = 146\n",
      "05:55:33 [DEBUG] test episode 70: reward = -85.00, steps = 85\n",
      "05:55:33 [DEBUG] test episode 71: reward = -86.00, steps = 86\n",
      "05:55:33 [DEBUG] test episode 72: reward = -147.00, steps = 147\n",
      "05:55:33 [DEBUG] test episode 73: reward = -86.00, steps = 86\n",
      "05:55:33 [DEBUG] test episode 74: reward = -88.00, steps = 88\n",
      "05:55:33 [DEBUG] test episode 75: reward = -158.00, steps = 158\n",
      "05:55:33 [DEBUG] test episode 76: reward = -91.00, steps = 91\n",
      "05:55:33 [DEBUG] test episode 77: reward = -156.00, steps = 156\n",
      "05:55:34 [DEBUG] test episode 78: reward = -147.00, steps = 147\n",
      "05:55:34 [DEBUG] test episode 79: reward = -147.00, steps = 147\n",
      "05:55:34 [DEBUG] test episode 80: reward = -146.00, steps = 146\n",
      "05:55:34 [DEBUG] test episode 81: reward = -89.00, steps = 89\n",
      "05:55:34 [DEBUG] test episode 82: reward = -146.00, steps = 146\n",
      "05:55:34 [DEBUG] test episode 83: reward = -146.00, steps = 146\n",
      "05:55:34 [DEBUG] test episode 84: reward = -152.00, steps = 152\n",
      "05:55:34 [DEBUG] test episode 85: reward = -87.00, steps = 87\n",
      "05:55:34 [DEBUG] test episode 86: reward = -156.00, steps = 156\n",
      "05:55:35 [DEBUG] test episode 87: reward = -158.00, steps = 158\n",
      "05:55:35 [DEBUG] test episode 88: reward = -147.00, steps = 147\n",
      "05:55:35 [DEBUG] test episode 89: reward = -156.00, steps = 156\n",
      "05:55:35 [DEBUG] test episode 90: reward = -157.00, steps = 157\n",
      "05:55:35 [DEBUG] test episode 91: reward = -153.00, steps = 153\n",
      "05:55:35 [DEBUG] test episode 92: reward = -148.00, steps = 148\n",
      "05:55:35 [DEBUG] test episode 93: reward = -147.00, steps = 147\n",
      "05:55:35 [DEBUG] test episode 94: reward = -145.00, steps = 145\n",
      "05:55:36 [DEBUG] test episode 95: reward = -147.00, steps = 147\n",
      "05:55:36 [DEBUG] test episode 96: reward = -159.00, steps = 159\n",
      "05:55:36 [DEBUG] test episode 97: reward = -89.00, steps = 89\n",
      "05:55:36 [DEBUG] test episode 98: reward = -146.00, steps = 146\n",
      "05:55:36 [DEBUG] test episode 99: reward = -147.00, steps = 147\n",
      "05:55:36 [INFO] average episode reward = -127.82 ± 29.65\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD5CAYAAADP2jUWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAlJElEQVR4nO3de7RkZXnn8e9TdS7ddNOCIhe7aYHYZgVax9AnLcx4W4racWK4KFnEFXCWJh2JWepkOUam18zEmTBZOokmTJSkF8aIZkSN9uASCYJRUacNHC5CIyKNFzjSQhMEmkt3n3PqmT/23lW7du267Lqcs99Tv89anVPn3buq9tmSp5569rPf19wdEREZL5XlPgAREVl6Cv4iImNIwV9EZAwp+IuIjCEFfxGRMaTgLyIyhiZG9cJm9hLgb4BVwALwB+5+U7ztEuDtwCLwLne/rtvrHXPMMX7SSSeN6nBFRFakW2655RF3f252fGTBH/gQ8AF3v9bM3hD//iozOxW4ADgNeB5wg5m90N0XO73YSSedxOzs7AgPV0Rk5TGzn+aNj7Ls48C6+PGzgAfjx2cDV7n7IXf/MbAX2DrC4xARkYxRZv7vAa4zsz8n+pD5t/H4euC7qf3m4rEWZrYd2A6wcePGkR2oiMi4GSj4m9kNwPE5m3YArwH+o7t/wcx+C/g4cBZgOfvnzjHh7juBnQAzMzOah0JEZEgGCv7ufla7bWZ2JfDu+NfPA1fEj+eAE1O7bqBREhIRkSUwypr/g8Ar48evBu6NH38JuMDMps3sZGATcNMIj0NERDJGWfP/PeCvzGwCOEhcu3f3u8zsc8D3iVpA39mt00dERIZrZMHf3b8NbGmz7VLg0lG9t4iIdKY7fEVESsjd+dzsAxxaGE1hRMFfRKSErv/+Q7zvH+/gI9ff233nPij4i4gM0YOPPcN/+MRNHDg4Xx+75It38PV7HuajX9/Lp3b/pKfXeWY+yvh/9tgzozjMkV7wFREZO5d97V6+cc9+vvS9B3nL1o2YGZ+56QE+c9MD9X0uPPOkrq8zPVEF4NC8yj4iIqU3PRGF1R279nDR3/XfxT49Gb3OoYXaUI4rS8FfRGSIpier9cffuveRjvs+dWiBW376aP7rxB8iB5X5i4iUXxK0e/Huq27nTZfv5tGnDrdsm6qONvNXzV9EZEi+cuc+vnHP/qYx9/bTku352eNAa3b/rXv3s+u2nwEK/iIipfcH/3Bry1iH2F9nmekuL/x441qB+vxFRAJU6yX6d3BoXhd8RURK4+EnDvKLpw7zo/1PdtyvNuBk9Cr7iIiUyNb/+bX642/+p1fx/Oesyd2vU+bvOUuZHM4E+1GVfRT8RUQGtO/xg3z1rodyt/VS9rHUGlcPHzjYtG1UZR8FfxGRAe269Wd8dvaBlvGpaqWnsk/6G8DPH28O/ocXVfMXkRL69r2PsO0vb2wpV6xUP/j5E7zp8v/XNPbU4YXcfacnKz1l/skul3zxDnbs2jPwMfZCmb+IDOQ/77qT+x99mn2PP9O27r2SvO0TN/NgJjtvZ9VkFe/wmZgE/eQDIj3/z6gp8xcR6ZG75wb+drn9qoKZ/1JS8BeRsfXPP3iIx55unVqhnaJtm6smqgr+IiJlsu/xZ3jb38/y3s9/r+fndJqqIc+qyWpPHxiD3gjWDwV/ERlLc7+IFknJm1StnbYhus2G6YlKTx8Y3YL/4qB3iuVQ8BeRsfTQE1Ht/tgjV/X8nKIJulnnUlGyqdM+x6ydZn4E7Z4K/iIylh564hAAx62b7vk5eXfkdlJzWOx0h299U/t9Pv27W1mVWiNgWBT8RWQsPRxn/sesLRD8C2b+NXdqPZRsOu1SyU75OSQK/iIylpKyT6UymuD6aycdTc17+8DoVPMf0eEp+IvIeErKPkW0i9F55aBqxXD3Lhdzo221jiX9Emb+Zna+md1lZjUzm8lsu8TM9prZPWb2+tT4FjO7M952
mdmIvtOIiHRw4NA8UKx9s0jNf6IS3eDVU59/h9cta+a/BzgPuDE9aGanAhcApwHbgI+ZWXLF4nJgO7Ap/rdtwGMQEelbkTp+kX2rFaNW6+3GsE6vW8qav7vf7e735Gw6G7jK3Q+5+4+BvcBWMzsBWOfuuz36uL0SOGeQYxARGUSRa7jt9s0L3tWKUXMfuM9/VLWRUdX81wPpGYrm4rH18ePsuIjIsiiW+fe+c1Tz79LnX5/Yrf0+o8r8u87qaWY3AMfnbNrh7le3e1rOmHcYb/fe24lKRGzcuLHLkYqIFFdkaoUi3xIm4sy/t7l9lj7z7xr83f2sPl53Djgx9fsG4MF4fEPOeLv33gnsBJiZmVmGqY9EZKVK4m2hsk+BnSsFgn+nzH9UPTGjKvt8CbjAzKbN7GSiC7s3ufs+4ICZnRF3+VwEtPv2ICIydI88eYhzP/Yd9iVTMxeq+7QZzoybReWamndr40yeH1i3j5mda2ZzwJnANWZ2HYC73wV8Dvg+8E/AO909WYX4YuAKoovA9wHXDnIMIlIOyzEtcT8+e/MD3Hb/Y/UJ3Ypd8O1t74oZFaNr5p+e26fdQu3LVvPvxN13AbvabLsUuDRnfBbYPMj7ikj5BBL7OZRZbnIUrZ6VeubfW9nn2/fu57f+dnfutlHdCKU7fEVkKIrOdb9cshl2kRu3et3TzKIZPXvs8//mD/d3fK1RUPAXkaEII/TDoflBMv/8nbMfIEnm7z32+Xfao5Q1fxGRRCCJf07m37ueM3+Smn+3Pv94bp+OF3yV+YtIqYUR/Q9mMv9Cff4Fav7VIq2eHTqCQrvDV0TGTKiZf5HPrCLdPpa0eg48vYMyfxEpsUBif0vmX+i4C/X5E9f8e3jZ1D6b16/jva97Yf33UdX8B2r1FBFJhJL5H5zP1PyHML1DdrxSsZ5aPRt9/o19fn3zCRy5qhGalfmLSKkVXd92uSxFn7+RusO3h4nd0rskJaPG770fXxEK/iIyFKFk/oN1+0R7v+TEo5rHMy9SSfr826zhm/22kd6nYs0BX90+IlJqwQT/gfr8o5+/vfVEXvXLz02NN7+I1fv88y/mZj8P0vtUzLCR3dfboOAvIkMRStnn4BDu8M2G59YAn57bp/V1svunfzNl/iISkvHI/OOdrflC7GLOa3S64JuM5d3kFU0Kp5q/iMhQDdTt04j9zZl/Tnqf9PnnvX52KH2TV8Wab+xSt4+IlFoomf/BbLdPgefWg398QTexmBP8kz7/nso+qd+rFWX+IhKQUGr+hwdp9Yz/Rkv9X2gX/Nvf4dt6wbfxOPvBosxfREotlMw/q9AF33rm31yayQvwvVzwzbvJK1vzHxUFfxEZihBif7beD/2t4mjWXPNfbHkRx5JWz7w+/1p274ZszX9UFPxFZChCWMzly3fsaxkrVvNPyj691Pyt7bbsN4Xmm7yU+YtIQMof+uHr9zzccgG1/8y/W80/+rmQM19zdu/0h0G2pDQqCv4iMhQBJP4sLNZ41urJprF+Wj2B7pl/HP0XOmX+beb2UeYvIgEpf/Sveesds8U+tOKyT6YjJ+8O32R74bJPZXTtnWkK/iIyFCFk/u6NjLw+1k+3D93KPl7/kFnIuf03e67Sv2dn9RwVBX8RGYoAYj/gVAfI/NM1f5oy/9Z9K0Uy/6aav8o+IhKQYDL/7AXfgs+HXiZ2a5SX8mv+ze9da8r8WYI5PRX8RWRIQmj1rLm3ln36ucM3O7Fbm7l9om2t3T7Z3v906alqRmUJIvNAb2Fm55vZXWZWM7OZ1PhrzewWM7sz/vnq1LYt8fheM7vMlqK4JSIjV/7QHx1jdWg1/4bOrZ7FJnazQGr+e4DzgBsz448Ab3T3FwFvBT6V2nY5sB3YFP/bNuAxiEgJBJD4405Lzb/Ip1a76R2ywd+92E1e86noX7HRzeGfNtAC7u5+N7ROPOTut6V+vQtYZWbTwLOBde6+O37elcA5wLWDHIeILL8QJnZzWm+g
6rTAeuvz08u5dH6NTpl/dj7/lm6fno+of0tR838TcJu7HwLWA3OpbXPxmIiErvyxH3fPKfsUeX70s1vNP719YTGn5p9zc1ci6vMvQeZvZjcAx+ds2uHuV3d57mnAB4HXJUM5u7U992a2nahExMaNG7sdqogsowBif1M5Jj1WVG81//bdPp0ujketnsWPqaiuwd/dz+rnhc1sA7ALuMjd74uH54ANqd02AA92eO+dwE6AmZmZEP7bEhlbQdT8UzdfNcYKPD/euZKZ1rNon3+n9wz6Ji8zOwq4BrjE3b+TjLv7PuCAmZ0Rd/lcBHT89iAiYQih5l+r5XT79FHz7zaxW/obRueaf+t7VCyA6R3M7FwzmwPOBK4xs+viTX8IvAD4L2Z2e/zv2HjbxcAVwF7gPnSxV2RFCCbzH1rNvzHeOp9/Y/tizvQOOa3/dUuV+Q/a7bOLqLSTHf9T4E/bPGcW2DzI+4pI+QQQ++NWz+xggefHP1vu8C1Y8+/UYWQhZP4iIokQ7vB1ci74FrrJq3GXV7fMP7lLN+8O306nKuiav4iMn/KH/ih4Z8s+nUowLc+Pf2Zn9cyv3feX+VeWqNtHwV9EhiOA6J93h29/C7hby81iWY25fTpc8M157+oS9fkr+IvIUITQ7ePQMmlaX4u50H2pxc53+LZ/714+WIZBwV9EhiKAkn9U9hlCn3/0Ep0jdGMxl7yaf6dWT83nLyIBCSH41zxvLrLen9/U7TOEzL/dnEDK/EUkGAHE/mhit9zRHp+f7vPv9j491PzbLQKjzF9EghFCqyfuLVl1oczfi9T8O83tE/3MmxYiewPZqCj4i8hQBBD6czP/QjX/5EFmeoc8Hef2cW/7YanMX0SCEkLiX8u54FtoPv/6PV691Pw7r+Gbl/Unz1Ofv4gEpPzR3721pNL3Gr4d9kuXhXLX8HVv+6FTsdaL0qOg4C8iQxFC5h8dY/+tntQz/84BOj2NxELexG4dgr+p7CMiIQkg9lPLveBbZErnSC+Zeac1fN3bf1hWunyrGJaBZvUUEUmEkPnDYDNmNt/k1dv7tGv1bJf5VyvZOUNHQ8FfRIYiiOkdvLVLp6+aP90/AJJvB/O5Nf/8DwVYmnl9QMFfRIYkhMzfySn79H2TV4+tnm1r/vnPyy4OPyoK/iIyFAHEfmremln3M70DXVo901NHt1vAvVOf/1JQ8BeRoQjhDl+P6j5NivX599bqCd1q/p37/JeCgr+IjI3cO3z7mtitc83fUqtxtVvMpVOf/1JQ8BeRoQgg8Ye8WT0LPh+ag3s73Vo9O/X5LwUFfxEZihC6faLpHTKD/Xb7dNov9T4Lbe7w7dTnvxQU/EVkKELI/PMnduuv26db0b/fzL9itiQfowr+IjIUQQT/QRdzSU/s1iX6Wz3zb32Dg/OLPP7MfO7zouA/+pOp4C8iQxFA7I+md8iM9TOlcy9z7ieZf96Hy/u/eGfb51kFcM3tIyKBCKPVMy/zL97qCb20ejb2WD1Z7fk9gpjS2czON7O7zKxmZjM52zea2ZNm9t7U2BYzu9PM9prZZbZUl7ZFZKTKH/oj2YjTrt8+T7HMv/H42HXTPb9HtYdOomEYNPPfA5wH3Nhm+0eAazNjlwPbgU3xv20DHoOIlEEA0d8HLfsUqvk3th935Kqe38NsaTp+Bqr5u/vdkN+XambnAD8CnkqNnQCsc/fd8e9XAufQ+gEhIoEJo9UzJ2MvVK5K3eE7osx/qe7wHUnN38zWAH8MfCCzaT0wl/p9Lh4TkcAFUPLHaV3Gschh11Ktnt1CdDUV/Y8+Yqpl+29vPTH3eZWyLOBuZjeY2Z6cf2d3eNoHgI+4+5PZl8vZt+25N7PtZjZrZrP79+/vdqgisowCiP2DL+OYKvt0i9BrpxuFlTXTrUWW//bG0/j4W2c45Zg1TeNLtYB717KPu5/Vx+u+FHizmX0IOAqomdlB4AvAhtR+G4AHO7z3TmAnwMzMTAj/bYmMnSROhZH5
Q+syjkVW8uptYjcHnrV6sv77mqnWbp9Vk1Ve8yvH8Rdf/WHTuBnYEpzLkfT5u/vLk8dm9ifAk+7+1/HvB8zsDOBfgIuA/z2KYxCRpRVCzd9zl3Es8vzoZy+LuUxUG4WVI3Iy/0T2dSyQVs9zzWwOOBO4xsyu6+FpFwNXAHuB+9DFXpEVIYjM31s7afqa1bOHxVygUfdPZ/6vPfU4Pv+OM+u/503/sBQ1/0G7fXYBu7rs8yeZ32eBzYO8r4iUTwCxP57bpzmy9jOff7fFXJJNU9UKz9QWmzL/v/mdLU0Xg586vND6/AD6/EVEIgGk/nlln3700u0DMFltzfyz3zyePNga/JeCgr+IDEX5Q3/rMo7Vio2k5p+85NREFGKPmGpk/tms/slDyxP8NbGbiAxFAIl/yzw+Feu326e3KRgm44u+a6bbz+0zHy/w/qm3b+VXTljX87EMSpm/iAxFEBO70Zyxm/Wf+fciCf6rJ6v8j3M2s/6o1W33PWbtNMes7f1O4EEp8xeRoSh/6I9v8kqF7mrBhVPSi7l0SvzXxGWe56yd4v5Hn8bMuPCM53PhGc9v+5wjVy1tOFbwF5GhCCDxb1peEeKyT5Fun/hnp4ndfv8Vp/CWl24E4KNvOZ0v3jrHLz13Te6+aetSN4UtBQV/ERlIEjsDiP0tZZ9KpWjm331it+2vOIXnxOWb5x21mj989aaeXnvt1NKGY9X8RWQogqj5ZxZzKdztk3rcrurT77w8laVauT15vyV9NxFZsQKI/S3LOFbMin1o9VDzD2V5KgV/ERmKIOb2oTnzrxS94Jtu9WyT+4eyOKFq/iIyFCFk/mSmdI4u+BZ4eg83eRWt3vzVBS/hQM5dvn/02hdy+saji71YAQr+IjIUYcT+nLJPoZu8Ih3n9SmY+Z/9kvz1rN71mt4uFPdLZR8RGYoQMv/sMo79T+/Q/g7fJb5u2zcFfxEZiiBq/t68jKMVLfv0sJjLUq3BOygFfxEZihAy/2hK54ZqwTS9yGIuZafgLyJjwzN3eVXNis3nnzzokPmH8qGg4C8iQ1H2m7yS40sn+0XLPsnOnWr+oVC3j4gMRcljf9PF2kS10n+3Tzb2v+DYtbzjlb/E9ET76ZvLRJm/iAxFyWN/bptmZYApnbN5//qjVvPmLRsGOMKlpeAvIkNR9sy/Vi/ZNFjhKZ0bd/hmU/9QWjwTCv4iMhRlb/VMPpzSE6hVK0VbPSN5mX9o1wAU/EVkKMqe+ed9OEU9+QVq/h0mdlPmLyJjJQmqJY/9TYE7UbjmH//MX8wlrOiv4C8iA6kHz5Kn/u26fQr1+aeu+GYz/8CqPgr+IjKYUFbySr6htCzj2Mdr5U3voLKPiIylkif+wyn7dJjeod38/mU1UPA3s/PN7C4zq5nZTGbbi81sd7z9TjNbFY9viX/fa2aXWWiXyEWkSVIKKX23T/wzHaSLruTV+PbQWvOvBJZKD3q4e4DzgBvTg2Y2AXwaeIe7nwa8CpiPN18ObAc2xf+2DXgMIrKMAin5N/r8mxZwL1b2afr2kElba7WBDm/JDRT83f1ud78nZ9PrgDvc/Xvxfv/q7otmdgKwzt13e/RxeyVwziDHICLLK5iafz1wN2f+RQ68udun2fxiWNF/VF9UXgi4mV1nZrea2fvi8fXAXGq/uXgsl5ltN7NZM5vdv3//iA5VRAZRb/UsffSPfrSu5FXgJZr6/JvD/+HAgn/Xid3M7Abg+JxNO9z96g6v+zLg14Cnga+Z2S3AEzn7tj337r4T2AkwMzNT9v+0RMZSI/Mv9/+L5pV9opW8itf8obXb5/DCCgv+7n5WH687B3zT3R8BMLOvAKcTXQdIz3y0AXiwj9cXkZLwlgfllJ6aIVExo9ZPt09On39omf+oyj7XAS82syPii7+vBL7v7vuAA2Z2RtzlcxHQ7tuDiAQgnJp/3KlTSdf8+/vGEs3n3zw2VjV/MzvXzOaAM4FrzOw6AHf/BfBh4GbgduBWd78m
ftrFwBXAXuA+4NpBjkFElltS8y93+G+X+Rfr80+v4dsc/ecXyv33Zw20mIu77wJ2tdn2aaIyT3Z8Ftg8yPuKSHnUM/+Sx75azl1e0WIuvet0k9dYZf4iIp75WVrJlM6ZZRz7avXMuTf1UGAXfBX8RWQg9Tt8Sx798+7wzVvGcbHm/N/bfkYt50pwc+afKfso8xeRcdLI/Msd/Xud2+ezNz/Aez57O5/67k9bX4N0zT8yVY3CqIK/iIyV0Gr+3W7yevJQNBPNA48+3fIa6buEkw+R6YkojIbW56/gLyIDKXuXTyI5ykrTBV9a5vNfMx31wTx1eKHta0CjfDSVBH9l/iIyThoTu5X7QyC9EEsir+yzZioO/ocW816knvFnM//5xXL//VkK/iIymGBu8op+pss+eV07qyarADx1KD/zT56R/JyO9w+Ngr+IDCSUKZ3zZvWMr9VmvrVEj3PLPt54fjbzD02YRy0ipRHOYi55yzhGv6Rjf1K6zyv71NxT3xyaa/6hCfOoRaQ0wsv8G2P14J/abzHesW3ZJ1PzD3UxQgV/ERlIKBO7NVo9M4u50Fz2WYyX5Gpb9omfnw351cBWcB9obh8RkVAWc2lMzdAYq9f8U/slZZ+nc8o+jtejfjrj33nhFjYdd+TwDnYJKPiLyEAaQb/c0T93Gcc4W0/3+ifTOuRl/qnY35T5v+60vPWuyk1lHxEZSCg1/+RIs3f4QuaCb/xL3iIveTX/UCn4i8hggpneIfrZfMG3db/FVNTP3rjm7o2av4K/iIyzes0/lLJPelbPnMw/XQLKTtMc9fm3vk6IFPxFZCChTOyW1+ef1P/Tk7KlM/9su2f6Dt/AY7+Cv4gMJpTFXDr1+f+b//7V+lg6+D99uLnjp+kO3xEd51JR8BeRgYSymEujnNM6vUP+fq0dP07jDt9Qb+5KKPiLyEBCW8wlr+yTlp6ZOTvFg6fqPmGHfgV/ERmQh1L3iTVP7JYX/BvR/+mcXv9G5j/0Q1tSCv4iMhRlj/15K3lVC2f+nqr5hx39FfxFpG/pPvjyL+YS/bSmsk/rfouevuCb0+2jm7xEZNyl4325Q3+7ZRxbI3gt3eqZ1+0TPw489iv4i0j/mpZAKXn0Ty/j+OYtG5KHdUmLZ1Pm39Ln3yj7hB79Bwr+Zna+md1lZjUzm0mNT5rZJ83sTjO728wuSW3bEo/vNbPLLPR+KZEx1lT2Wcbj6EWtEfv54JtezJ4PvL7p4u9CfKG3VvP6Ai2dM/+wQ9egmf8e4Dzgxsz4+cC0u78I2AL8vpmdFG+7HNgObIr/bRvwGERkmTRn/mUP//EFXzOqFWPt9ERT3b6e+deciYpxxFQ1J/NXzR8Ad7/b3e/J2wSsMbMJYDVwGHjCzE4A1rn7bo/+S7kSOGeQYxCR5RNUzb9Ln/9CquxTNWPN9ERu5k+bxVxCM6qa/z8CTwH7gPuBP3f3R4H1wFxqv7l4LJeZbTezWTOb3b9//4gOVUT61XRjV8mjf+P+3uwqvJHFxXgq55pTqRhrpqo5ff4e/PKNia6LuZjZDUDeSgU73P3qNk/bCiwCzwOOBr4Vv07e2Wr7n4y77wR2AszMzJT8Py2R8dOc+Zf7/0WTLp52rZ7zcc1/oeZUK8bqqYncO3xXyk1eXYO/u5/Vx+u+Bfgnd58HHjaz7wAzwLeADan9NgAP9vH6IlIyZS/5t87s0/wtIKn51zwK/lMTFeYXO03pHLZRlX3uB15tkTXAGcAP3H0fcMDMzoi7fC4C2n17EJGSa8r8yx78c5ZxTGfvC4uNC75VMyYr1hr80WIuAJjZuWY2B5wJXGNm18WbPgqsJeoGuhn4hLvfEW+7GLgC2AvcB1w7yDGIyPJJl3rKXvZxcso+qe2Nbp/o5q/JanPmf+v9v+Bzs3OpvzPs6D/QAu7uvgvYlTP+JFG7Z95zZoHNg7yviJRDkJl/m+0LqbJPpQKTExWefqZR899+5SwAjz09H71O2LFfd/iKSP+8
zeMyyiv7LKSmckhu8krKPlNVYyGV+Wc/3AKP/Qr+ItK/5ondlvFAepC3jGM6uNdr/h61ek5UWi/4QvNKXyFT8BeRvnmH38qmPr1DOvinAnm92ye54DtRYX6x9W9aUPAXkXEXVs2/9UJtOrgvpKZ3iC74WtPC7iuNgr+I9C+cG3xTUzo3xtJlnfTcPhUzpqqV+nUACP8Cb5aCv4j0ranVs/Spf/Sj6YJvU80/vuAb3+Q1UbWmbwZl//OKUvAXkb6FNLFb3jKOhzuWfSrMq+wjItIqrMVcop/WptunZXqHaoXDOd0+K4WCv4j0LaTFXPKWcWzu889M71CtrJjOnjwK/iLSt5AWc8k7vuYLvslKXlCpwETVWKx5/RuBLviKiMRKHu+b5Pb5p2r+86mbvJKafzQefSiE9Lf2QsFfRPrW3O2zjAfSk+SCb7rPv3OrZ3aflUTBX0T6F9BiLvVlHFNR75ePP7L+eCFzwXeyGn1ILOTc5bsSKPiLSN9C6vapl31Smf/vvfwU/vbCLUCj5p9c8J1Q5i8iki+o6R1y5vOvVIxfPfEooLnbp1JplH1Warungr+I9C2oxVzazOdfrTSXdxZrzkTFmJyIxvMmd1sJFPxFpG9hZf4Ry/RsTsQXAeqZfzylc9Lts6DMX0SkWViLueT360/EF3Ybff5xzb/SXPaplf3TrSAFfxHpmwc0uU/Xsk8q869WjKlM2WellX8U/EWkbx5Sq2f9gm+27BNn/ovJYi7RFBDZm7xW2tz+Cv4iMhRlr4rU+/wzqX+S+c83zepJU/B39xXX9aPgLyJ9C6jqk9vnD9E3gWrFGn3+mZu85hd9RU7wpuAvIn0LaTGXdhd8ISr9LKTW8G0q+yzUVlzJBxT8RWQAIWX+jVbP1m0TFavX/BdqrRO7rcS7fBX8RaRvIU3vkLeMY6LaKfOvuTL/LDP7X2b2AzO7w8x2mdlRqW2XmNleM7vHzF6fGt9iZnfG2y6zvP8lRCQIIS3mkreMY2IitVj7osd3+CY1/4XairvYC4Nn/tcDm939xcAPgUsAzOxU4ALgNGAb8DEzq8bPuRzYDmyK/20b8BhEZJk0BfySp/6dyj7RBd/WNXwhKfuU+2/rx8QgT3b3r6Z+/S7w5vjx2cBV7n4I+LGZ7QW2mtlPgHXuvhvAzK4EzgGuHeQ4OvndT97MT//16VG9vMhYS2fEd//8AK/98DeX8Wg6e/yZeaC12wdgsmJ8+Y59zP7kFxxaqDVN7/AX1/+QI6aqLc9JWkSnJ8Ksng8U/DPeBnw2frye6MMgMRePzcePs+O5zGw70bcENm7c2NdBbXz2GqYC/R9HJASnbzya059/NLvve2S5D6WrY49cxbFHTreMb3/FKdz0k0cBeOHxR/KGzSdwzNop3v6yk9n3+DMAbNl4NC895dk8a/UUAC9a/yze9ZpNvGVrf7FpuVm39iwzuwE4PmfTDne/Ot5nBzADnOfubmYfBXa7+6fj7R8HvgLcD/yZu58Vj78ceJ+7v7Hbgc7MzPjs7Gzvf5mIiGBmt7j7THa8a+afBOoOL/xW4DeA13jjk2QOODG12wbgwXh8Q864iIgsoUG7fbYBfwz8prunC+tfAi4ws2kzO5nowu5N7r4POGBmZ8RdPhcBVw9yDCIiUtygNf+/BqaB6+OOze+6+zvc/S4z+xzwfWABeKe7L8bPuRj4e2A10YXekV3sFRGRfIN2+7ygw7ZLgUtzxmeBzYO8r4iIDEZtMCIiY0jBX0RkDCn4i4iMIQV/EZEx1PUmr7Iws/3AT/t8+jFA+W8/XD46P93pHHWm89Pdcp2j57v7c7ODwQT/QZjZbN4dbhLR+elO56gznZ/uynaOVPYRERlDCv4iImNoXIL/zuU+gJLT+elO56gznZ/uSnWOxqLmLyIizcYl8xcRkRQFfxGRMbSig7+ZbYsXkN9rZu9f7uNZLmb2d2b2sJntSY0928yuN7N7459Hp7ZdEp+ze8zs9ctz1EvHzE40
s6+b2d1mdpeZvTse1zkCzGyVmd1kZt+Lz88H4nGdnwwzq5rZbWb25fj38p4jd1+R/4AqcB9wCjAFfA84dbmPa5nOxSuA04E9qbEPAe+PH78f+GD8+NT4XE0DJ8fnsLrcf8OIz88JwOnx4yOBH8bnQeco+nsNWBs/ngT+BThD5yf3XP0R8H+AL8e/l/YcreTMfyuw191/5O6HgauIFpYfO+5+I/BoZvhs4JPx408C56TGr3L3Q+7+Y2Av0blcsdx9n7vfGj8+ANxNtLa0zhHgkSfjXyfjf47OTxMz2wD8e+CK1HBpz9FKDv7rgQdSv3dcLH4MHefRymrEP4+Nx8f6vJnZScCvEmW3OkexuJxxO/AwcL276/y0+kvgfUAtNVbac7SSg7/ljKmvtbuxPW9mthb4AvAed3+i0645Yyv6HLn7oru/hGjd7a1m1mlBprE7P2b2G8DD7n5Lr0/JGVvSc7SSg3+7ReQl8pCZnQAQ/3w4Hh/L82Zmk0SB/x/c/YvxsM5Rhrs/BnwD2IbOT9q/A37TzH5CVGJ+tZl9mhKfo5Uc/G8GNpnZyWY2BVxAtLC8RL4EvDV+/Fbg6tT4BWY2bWYnA5uAm5bh+JaMRQtQfxy4290/nNqkcwSY2XPN7Kj48WrgLOAH6PzUufsl7r7B3U8iijX/7O6/Q5nP0XJfHR/xlfc3EHVu3AfsWO7jWcbz8BlgHzBPlHG8HXgO8DXg3vjns1P774jP2T3Ary/38S/B+XkZ0VfuO4Db439v0Dmq/60vBm6Lz88e4L/G4zo/+efrVTS6fUp7jjS9g4jIGFrJZR8REWlDwV9EZAwp+IuIjCEFfxGRMaTgLyIyhhT8RUTGkIK/iMgY+v8mgmBuzXvrEAAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):\n",
    "    observation, reward, done = env.reset(), 0., False\n",
    "    agent.reset(mode=mode)\n",
    "    episode_reward, elapsed_steps = 0., 0\n",
    "    while True:\n",
    "        action = agent.step(observation, reward, done)\n",
    "        if render:\n",
    "            env.render()\n",
    "        if done:\n",
    "            break\n",
    "        observation, reward, done, _ = env.step(action)\n",
    "        episode_reward += reward\n",
    "        elapsed_steps += 1\n",
    "        if max_episode_steps and elapsed_steps >= max_episode_steps:\n",
    "            break\n",
    "    agent.close()\n",
    "    return episode_reward, elapsed_steps\n",
    "\n",
    "\n",
    "logging.info('==== train ====')\n",
    "episode_rewards = []\n",
    "# No preset episode count: training ends on the reward criterion below.\n",
    "for episode in itertools.count():\n",
    "    # env.unwrapped is passed together with an explicit step limit taken\n",
    "    # from env.spec, so play_episode itself enforces the episode length.\n",
    "    episode_reward, elapsed_steps = play_episode(env.unwrapped, agent,\n",
    "            max_episode_steps=env.spec.max_episode_steps, mode='train')\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('train episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "    # Stop once the mean over the (up to) 10 latest episodes exceeds -110.\n",
    "    if np.mean(episode_rewards[-10:]) > -110:\n",
    "        break\n",
    "# Draw the training reward curve a single time.\n",
    "plt.plot(episode_rewards)\n",
    "\n",
    "\n",
    "# Evaluation: play 100 episodes on `env` with play_episode's default\n",
    "# arguments (mode=None, no explicit step cap) and log each outcome.\n",
    "logging.info('==== test ====')\n",
    "episode_rewards = []\n",
    "for episode in range(100):\n",
    "    outcome = play_episode(env, agent)\n",
    "    episode_reward, elapsed_steps = outcome\n",
    "    episode_rewards += [episode_reward]\n",
    "    logging.debug('test episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "\n",
    "# Summarise the run as mean and standard deviation of the episode rewards.\n",
    "mean_reward = np.mean(episode_rewards)\n",
    "std_reward = np.std(episode_rewards)\n",
    "logging.info('average episode reward = %.2f ± %.2f', mean_reward, std_reward)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close the Gym environment now that training and testing are done.\n",
    "env.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
